In [1]:
"""
Readme: every sessions before "with tf.Session(config=sess_config) as sess:" are for preparing reasons only,
therefore you only need to run these functions ONCE
for further tuning, you can change the parameters starts from "with tf.Session(config=sess_config) as sess"

if you want to predict single photo, change src/config.py DEMO_DYPE = image_path
if you want to predict photos in a folder, change src/config.py DEMO_DYPE = image_path,
  change variable position to the range, for example, [1,3] means predict the 1-th to 3-th images in the dir
if you want to predict a mp4 video, change src/config.py DEMO_DYPE = video_path,
  notice that run video will take much longer time
"""
Out[1]:
'\nReadme: every sessions before "with tf.Session(config=sess_config) as sess:" are for preparing reasons only,\ntherefore you only need to run these functions ONCE\nfor further tuning, you can change the parameters starts from "with tf.Session(config=sess_config) as sess"\n\nif you want to predict single photo, change src/config.py DEMO_DYPE = image_path\nif you want to predict photos in a folder, change src/config.py DEMO_DYPE = image_path,\n  change variable position to the range, for example, [1,3] means predict the 1-th to 3-th images in the dir\nif you want to predict a mp4 video, change src/config.py DEMO_DYPE = video_path,\n  notice that run video will take much longer time\n'
In [2]:
import warnings
warnings.simplefilter("ignore")
import tensorflow as tf
import numpy as np
import cv2
import time
import math
import importlib
from PIL import Image
from matplotlib import pyplot as plt
from pathlib import Path
In [3]:
import os
import sys
# Make the project's `src` modules importable from this notebook.
sys.path.insert(0, 'src')
tensorflow_path = "src/phase1_models"
# The TF object-detection API (phase-1 hand detector) lives inside this tree;
# expose its packages on sys.path so its internal imports resolve.
sys.path.append(tensorflow_path + "/research")
sys.path.append(tensorflow_path + "/research/object_detection")
sys.path.append(tensorflow_path + "/research/object_detection/utils")
sys.path.append(tensorflow_path + "/research/object_detection/utils/slim")
from config import FLAGS
import Detector as dt
import src.video2jpg as v2j
import src.jpg2video as j2v
In [4]:
# None means select all images in the folder; [a, b] selects the a-th to b-th images.
# This variable is used for folder prediction only (ignored for single image / video).
position = None
In [5]:
# Import the convolutional pose machine (CPM) network definition selected in config.
cpm_model = importlib.import_module('src.phase2_models.nets.' + FLAGS.network_def)
In [6]:
# Restrict TensorFlow to the GPU configured in FLAGS.
os.environ['CUDA_VISIBLE_DEVICES'] = str(FLAGS.gpu_id)

# Build the CPM network graph once, and a saver for restoring pretrained weights later.
model = cpm_model.CPM_Model()
saver = tf.train.Saver()
WARNING:tensorflow:From /home/shared/anaconda3/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
In [7]:
# Fetch the graph tensor holding the network's output heatmaps by its configured name.
output_node = tf.get_default_graph().get_tensor_by_name(name=FLAGS.output_node_names)

# Session/GPU settings: cap per-process GPU memory at 20%, grow allocation on demand,
# and fall back to CPU placement when an op has no GPU kernel.
device_count = {'GPU': 1} if FLAGS.use_gpu else {'GPU': 0}
sess_config = tf.ConfigProto(device_count=device_count)
sess_config.gpu_options.per_process_gpu_memory_fraction = 0.2
sess_config.gpu_options.allow_growth = True
sess_config.allow_soft_placement = True
In [8]:
def correct_and_draw_hand(full_img, stage_heatmap, crop_img, original_info):
    """Locate every joint in the stage heatmap, then draw the hand skeleton.

    full_img: full frame (drawn on in place, joints mapped back to frame coords)
    stage_heatmap: (image_size, image_size, total_joints) heatmap from the CPM
    crop_img: the resized hand crop (drawn on in place, local coords)
    original_info: [ymin, xmin, crop_height] used to map crop coords back to the
        frame. NOTE(review): only height (shape[0]) is used as the scale factor,
        which assumes the crop is square — confirm against the caller's resize.
    """
    num_joints = FLAGS.total_joints
    global_joint = np.zeros((num_joints, 2))
    local_joint = np.zeros((num_joints, 2))
    likelihood_sum = 0.0

    for idx in range(num_joints):
        heatmap = stage_heatmap[:, :, idx]
        # Peak response location = predicted joint position (row, col).
        row, col = np.unravel_index(np.argmax(heatmap),
                                    (FLAGS.image_size, FLAGS.image_size))
        # Accumulate each joint's peak score for the mean-likelihood readout.
        likelihood_sum += heatmap[row, col]

        coord = np.asarray((row, col), dtype=np.float32)
        local_joint[idx] = coord

        # Map from heatmap space back to the original frame: rescale to the
        # crop's size, then shift by the crop's top-left corner.
        coord *= original_info[2] / FLAGS.image_size
        coord[0] += original_info[0]
        coord[1] += original_info[1]
        global_joint[idx] = coord

    # Skeleton overlays on both views.
    draw_hand(full_img, global_joint)
    draw_hand(crop_img, local_joint)

    # Mean peak response, rendered on the crop and at the box corner of the frame.
    label = 'Likelihood: {:<.3f}'.format(likelihood_sum / num_joints)
    cv2.putText(crop_img, label,
                org=(20, 20), fontFace=cv2.FONT_HERSHEY_PLAIN, fontScale=1, color=(255, 0, 0))
    cv2.putText(full_img, label,
            org=(original_info[1], original_info[0]), fontFace=cv2.FONT_HERSHEY_PLAIN, fontScale=1, color=(255, 0, 0))
In [9]:
def draw_hand(canvas, joint):
    """Draw joint circles and limb lines on `canvas` in place.

    canvas: BGR image to draw on.
    joint: array-like of (row, col) joint coordinates; drawing swaps to cv2's
        (x, y) = (col, row) order.
    """
    # Fix: the original assigned `font = cv2.FONT_HERSHEY_SIMPLEX` but never used it.
    for i in range(len(joint)):
        cv2.circle(canvas, (int(joint[i][1]), int(joint[i][0])), 4, FLAGS.joint_color_code[i], thickness=-1)
    for edge in FLAGS.limbs:
        u, v = edge
        # Limb is colored with its endpoint joint's color.
        cv2.line(canvas, (int(joint[u][1]), int(joint[u][0])), (int(joint[v][1]), int(joint[v][0])), FLAGS.joint_color_code[v], 3)

def normalize_and_centralize_img(img):
    """Scale pixel values from [0, 255] into roughly [-0.5, 0.5) and prepend a batch axis."""
    centered = img / 256.0 - 0.5
    batched = np.expand_dims(centered, axis=0)
    return batched
In [10]:
def predict_image(padding, position=None):
    """Run the full two-phase pipeline: detect hands, then estimate joints.

    padding: padding passed to the phase-1 bounding-box detector.
    position: optional [a, b] range selecting the a-th to b-th files when
        FLAGS.DEMO_TYPE is a folder; None processes everything.
    Returns: list of 1280x720 BGR frames with skeletons drawn on them.

    NOTE: reads the globals `sess`, `model` and `output_node` — the TF session
    must already be open (see the `with tf.Session(...)` cell) before calling.
    """
    print("Prediction starts...")
    test_img = []  # all input frames, whether from a photo, a folder, or a video
    img_name = []  # matching source paths
    if FLAGS.DEMO_TYPE.endswith(('png', 'jpg')):
        # Single photo.
        test_img.append(cv2.imread(FLAGS.DEMO_TYPE))
        img_name.append(FLAGS.DEMO_TYPE)
    elif FLAGS.DEMO_TYPE.endswith('mp4'):
        # Video: clear stale intermediates, then split the mp4 into frames.
        for file in os.listdir('src/video_dir/sequence_img'):
            os.remove('src/video_dir/sequence_img/' + file)
        for file in os.listdir('src/video_dir/labeled_img'):
            os.remove('src/video_dir/labeled_img/' + file)
        if Path('src/video_dir/output.mp4').is_file():
            os.remove('src/video_dir/output.mp4')
        print("Old intermediate file removed, if any")
        v2j.extractImages(FLAGS.DEMO_TYPE, 'src/video_dir/sequence_img', 'src/video_dir/labeled_img')
        file_list = sorted(os.listdir('src/video_dir/sequence_img'))
        for img in file_list:
            test_img.append(cv2.imread(os.path.join('src/video_dir/sequence_img', img)))
            img_name.append(os.path.join('src/video_dir/sequence_img', img))
    else:
        # Folder of photos, optionally sliced by `position`.
        file_list = sorted(os.listdir(FLAGS.DEMO_TYPE))
        if position is not None:  # fix: was `position != None`
            file_list = file_list[position[0]:position[1]]

        for img in file_list:
            if not img.endswith(('png', 'jpg')):
                # Skip non-image files such as ./checkpoint.
                continue
            test_img.append(cv2.imread(os.path.join(FLAGS.DEMO_TYPE, img)))
            img_name.append(os.path.join(FLAGS.DEMO_TYPE, img))

    # Phase 1: hand bounding boxes for every frame.
    tt = time.time()
    if FLAGS.DEMO_TYPE.endswith('mp4'):
        boxes = dt.bounding_box_from_folder('src/video_dir/sequence_img', padding, position)
    else:
        boxes = dt.bounding_box_from_folder(FLAGS.DEMO_TYPE, padding, position)
    print("Time for Phase 1 Hand Detection: ", time.time() - tt)

    # Phase 2: joint estimation on each detected hand crop.
    tmp_img_list = []
    for index, b_box in enumerate(boxes):
        t0 = time.time()
        bb_img = []
        original_info = []  # per crop: [ymin, xmin, crop height] to undo the resize

        # Use at most the first two detected hands (boxes are [xmin, ymin, xmax, ymax]).
        for bb_box in b_box[:2]:
            crop = np.copy(test_img[index][bb_box[1]:bb_box[3], bb_box[0]:bb_box[2], :])
            bb_img.append(crop)
            original_info.append([bb_box[1], bb_box[0], crop.shape[0]])

        for i in range(len(bb_img)):
            # Force the crop to the network input size, then normalize.
            bb_img[i] = cv2.resize(bb_img[i], (FLAGS.image_size, FLAGS.image_size))
            input_batch = normalize_and_centralize_img(bb_img[i])

            stage_heatmap_np = sess.run(output_node,
                                        feed_dict={model.input_images: input_batch})

            correct_and_draw_hand(test_img[index],
                                  cv2.resize(stage_heatmap_np[0], (FLAGS.image_size, FLAGS.image_size)),
                                  bb_img[i], original_info[i])

        # Collect the visualized frame at display resolution.
        tmp_img = cv2.resize(test_img[index], (1280, 720))
        tmp_img_list.append(tmp_img)
        print("Gesture estimation time for " + str(index) + ": " + str(time.time() - t0))
    return tmp_img_list
In [11]:
with tf.Session(config=sess_config) as sess:

    model_path_suffix = 'stages_{}'.format(FLAGS.total_stages)
    # Folder holding the phase-2 (CPM) weights for the configured stage count.
    model_save_dir = os.path.join('src/phase2_models',
                                  'weights',
                                  model_path_suffix)
    print('Load model from [{}]'.format(os.path.join(model_save_dir, FLAGS.pretrained_model)))

    # Restore pretrained weights when a checkpoint name is configured.
    if FLAGS.pretrained_model != '':
        saver.restore(sess, os.path.join(model_save_dir, FLAGS.pretrained_model))
              
    t1 = time.time()
    # predict_image reads the global `sess` opened above; it handles single
    # photos, folders and mp4 videos alike based on FLAGS.DEMO_TYPE.
    #if FLAGS.DEMO_TYPE.endswith(('png', 'jpg')): 
    predictions = predict_image(FLAGS.padding, None) # the second argument works as range [x, y] indicates from x-th to y-th for a folder
    print("The total time we use is: ", time.time() - t1)
    #cv2.waitKey(0)
Load model from [src/phase2_models/weights/stages_3/cpm_model-200000]
INFO:tensorflow:Restoring parameters from src/phase2_models/weights/stages_3/cpm_model-200000
Prediction starts...
Old intermediate file removed, if any
Extraction starts
149  images have been generated
Time for Phase 1 Hand Detection:  202.79252099990845
Gesture estimation time for 0: 0.7613146305084229
Gesture estimation time for 1: 0.1374983787536621
Gesture estimation time for 2: 0.12679743766784668
Gesture estimation time for 3: 0.12372446060180664
Gesture estimation time for 4: 0.11955142021179199
Gesture estimation time for 5: 0.1172182559967041
Gesture estimation time for 6: 0.11911177635192871
Gesture estimation time for 7: 0.11799836158752441
Gesture estimation time for 8: 0.11639761924743652
Gesture estimation time for 9: 0.11748147010803223
Gesture estimation time for 10: 0.11887741088867188
Gesture estimation time for 11: 0.11744284629821777
Gesture estimation time for 12: 0.11623597145080566
Gesture estimation time for 13: 0.11822962760925293
Gesture estimation time for 14: 0.11979079246520996
Gesture estimation time for 15: 0.11864829063415527
Gesture estimation time for 16: 0.11858391761779785
Gesture estimation time for 17: 0.11903858184814453
Gesture estimation time for 18: 0.1192176342010498
Gesture estimation time for 19: 0.11876654624938965
Gesture estimation time for 20: 0.11921072006225586
Gesture estimation time for 21: 0.11872315406799316
Gesture estimation time for 22: 0.11863470077514648
Gesture estimation time for 23: 0.11790776252746582
Gesture estimation time for 24: 0.11777067184448242
Gesture estimation time for 25: 0.11798357963562012
Gesture estimation time for 26: 0.11849570274353027
Gesture estimation time for 27: 0.1173396110534668
Gesture estimation time for 28: 0.11816287040710449
Gesture estimation time for 29: 0.11847782135009766
Gesture estimation time for 30: 0.11626839637756348
Gesture estimation time for 31: 0.1169431209564209
Gesture estimation time for 32: 0.11779975891113281
Gesture estimation time for 33: 0.11703753471374512
Gesture estimation time for 34: 0.11802911758422852
Gesture estimation time for 35: 0.11674237251281738
Gesture estimation time for 36: 0.1167144775390625
Gesture estimation time for 37: 0.11728119850158691
Gesture estimation time for 38: 0.11744141578674316
Gesture estimation time for 39: 0.11969232559204102
Gesture estimation time for 40: 0.12036442756652832
Gesture estimation time for 41: 0.11762714385986328
Gesture estimation time for 42: 0.1175537109375
Gesture estimation time for 43: 0.11683082580566406
Gesture estimation time for 44: 0.11818766593933105
Gesture estimation time for 45: 0.11913490295410156
Gesture estimation time for 46: 0.1171104907989502
Gesture estimation time for 47: 0.1215047836303711
Gesture estimation time for 48: 0.11718440055847168
Gesture estimation time for 49: 0.117218017578125
Gesture estimation time for 50: 0.11798214912414551
Gesture estimation time for 51: 0.11841845512390137
Gesture estimation time for 52: 0.11738109588623047
Gesture estimation time for 53: 0.11909914016723633
Gesture estimation time for 54: 0.11728310585021973
Gesture estimation time for 55: 0.11827445030212402
Gesture estimation time for 56: 0.1174781322479248
Gesture estimation time for 57: 0.11888456344604492
Gesture estimation time for 58: 0.11926865577697754
Gesture estimation time for 59: 0.11991000175476074
Gesture estimation time for 60: 0.11845636367797852
Gesture estimation time for 61: 0.12073349952697754
Gesture estimation time for 62: 0.11924004554748535
Gesture estimation time for 63: 0.11987900733947754
Gesture estimation time for 64: 0.11944890022277832
Gesture estimation time for 65: 0.11930036544799805
Gesture estimation time for 66: 0.12013530731201172
Gesture estimation time for 67: 0.11808443069458008
Gesture estimation time for 68: 0.11848163604736328
Gesture estimation time for 69: 0.11945652961730957
Gesture estimation time for 70: 0.11721348762512207
Gesture estimation time for 71: 0.11816740036010742
Gesture estimation time for 72: 0.11689329147338867
Gesture estimation time for 73: 0.11688947677612305
Gesture estimation time for 74: 0.1178731918334961
Gesture estimation time for 75: 0.1186518669128418
Gesture estimation time for 76: 0.11655211448669434
Gesture estimation time for 77: 0.1215822696685791
Gesture estimation time for 78: 0.11906218528747559
Gesture estimation time for 79: 0.11992287635803223
Gesture estimation time for 80: 0.11996889114379883
Gesture estimation time for 81: 0.11897993087768555
Gesture estimation time for 82: 0.11900115013122559
Gesture estimation time for 83: 0.1194000244140625
Gesture estimation time for 84: 0.1184232234954834
Gesture estimation time for 85: 0.11848044395446777
Gesture estimation time for 86: 0.11849451065063477
Gesture estimation time for 87: 0.12390995025634766
Gesture estimation time for 88: 0.12093734741210938
Gesture estimation time for 89: 0.12256217002868652
Gesture estimation time for 90: 0.12293481826782227
Gesture estimation time for 91: 0.12177467346191406
Gesture estimation time for 92: 0.11977386474609375
Gesture estimation time for 93: 0.11893701553344727
Gesture estimation time for 94: 0.11681461334228516
Gesture estimation time for 95: 0.12043094635009766
Gesture estimation time for 96: 0.11881470680236816
Gesture estimation time for 97: 0.1202080249786377
Gesture estimation time for 98: 0.11802005767822266
Gesture estimation time for 99: 0.11961078643798828
Gesture estimation time for 100: 0.11961627006530762
Gesture estimation time for 101: 0.12096524238586426
Gesture estimation time for 102: 0.12099480628967285
Gesture estimation time for 103: 0.11967730522155762
Gesture estimation time for 104: 0.11920499801635742
Gesture estimation time for 105: 0.11890125274658203
Gesture estimation time for 106: 0.11861538887023926
Gesture estimation time for 107: 0.1206357479095459
Gesture estimation time for 108: 0.11945581436157227
Gesture estimation time for 109: 0.11902666091918945
Gesture estimation time for 110: 0.11884307861328125
Gesture estimation time for 111: 0.11973237991333008
Gesture estimation time for 112: 0.12136363983154297
Gesture estimation time for 113: 0.11963748931884766
Gesture estimation time for 114: 0.11871147155761719
Gesture estimation time for 115: 0.11790752410888672
Gesture estimation time for 116: 0.11908531188964844
Gesture estimation time for 117: 0.11900639533996582
Gesture estimation time for 118: 0.11840200424194336
Gesture estimation time for 119: 0.12004280090332031
Gesture estimation time for 120: 0.11967015266418457
Gesture estimation time for 121: 0.11923551559448242
Gesture estimation time for 122: 0.11875176429748535
Gesture estimation time for 123: 0.11856651306152344
Gesture estimation time for 124: 0.11831521987915039
Gesture estimation time for 125: 0.11950206756591797
Gesture estimation time for 126: 0.11856722831726074
Gesture estimation time for 127: 0.11959695816040039
Gesture estimation time for 128: 0.11943435668945312
Gesture estimation time for 129: 0.12104606628417969
Gesture estimation time for 130: 0.11899256706237793
Gesture estimation time for 131: 0.11914610862731934
Gesture estimation time for 132: 0.11872172355651855
Gesture estimation time for 133: 0.11937880516052246
Gesture estimation time for 134: 0.11973404884338379
Gesture estimation time for 135: 0.1194467544555664
Gesture estimation time for 136: 0.11856436729431152
Gesture estimation time for 137: 0.11881875991821289
Gesture estimation time for 138: 0.11802196502685547
Gesture estimation time for 139: 0.11901473999023438
Gesture estimation time for 140: 0.11864781379699707
Gesture estimation time for 141: 0.12010073661804199
Gesture estimation time for 142: 0.11844992637634277
Gesture estimation time for 143: 0.11908388137817383
Gesture estimation time for 144: 0.11839795112609863
Gesture estimation time for 145: 0.12024712562561035
Gesture estimation time for 146: 0.11902284622192383
Gesture estimation time for 147: 0.11805129051208496
Gesture estimation time for 148: 0.11713075637817383
The total time we use is:  227.3702414035797
In [12]:
# Write the labeled frames to disk; for video input, also re-assemble them into an mp4.
if FLAGS.DEMO_TYPE.endswith('mp4'):
    for idx, img in enumerate(predictions):
        # Fix: original concatenated "labeled_img/" + "/%08d.jpg", producing a
        # double slash in the path; build it with os.path.join instead.
        cv2.imwrite(os.path.join("src/video_dir/labeled_img", "%08d.jpg" % idx), img)
    j2v.jpg_to_video('src/video_dir/labeled_img', 'src/video_dir/output.mp4')
else:
    for idx, img in enumerate(predictions):
        cv2.imwrite(os.path.join("src/predict_img", str(idx) + ".jpg"), img)
The output video is src/video_dir/output.mp4
In [13]:
# cv2 on GCP cannot use imshow, so preview with matplotlib instead.
# Fix: cv2 images are BGR while plt.imshow expects RGB — without conversion the
# displayed colors are channel-swapped (red/blue exchanged).
for idx, img in enumerate(predictions):
    if FLAGS.DEMO_TYPE.endswith('mp4') and idx == 10:
        # A video can yield hundreds of frames; only preview the first few.
        break
    plt.figure(figsize=(12, 8))
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))